Physiologically driven homogenization of marine ecosystems after the end-Permian mass extinction
Jood Al Aswad1*, Justin L. Penn2, Pedro Monarrez1,3, Mohamad Bazzi1, Curtis Deutsch2, Jonathan L. Payne1
1 Department of Earth and Planetary Sciences, Stanford University, 450 Jane Stanford Way, Stanford, CA 94305, USA. 2 Department of Geosciences, Princeton University, Guyot Hall, Princeton, NJ 08544, USA. 3 Department of Earth, Planetary, and Space Sciences, University of Los Angeles, CA 90095, USA.
Code compiled and curated by J.A.A. and M.B. Contact and Contact
Correspondence and requests for materials should be addressed to J.A.A Contact
#' @return calculate great circle distance in kilometers (km).#' @param R Earth mean radius (km)#' @param long1.r convert from degrees to radians for latitudes and longitudes.#' @exportgcd.slc <-function(long1, lat1, long2, lat2) { R <-6371 long1.r <- long1*pi/180 long2.r <- long2*pi/180 lat1.r <- lat1*pi/180 lat2.r <- lat2*pi/180 d <-acos(sin(lat1.r)*sin(lat2.r) +cos(lat1.r)*cos(lat2.r) *cos(long2.r-long1.r)) * Rreturn(d) }#' @return calculate jaccard similarity coefficient#' @param #' @param #' @exportjaccard_similarity <-function(x) { js_table <-list()for (k inseq_along(x)) {# Unique cells. unique_cells <-unique(x[[k]]$cell) jaccard_similarity_table <-data.frame(cell_x =character(), cell_y =character(), jaccard_similarity =numeric(), stringsAsFactors = F)for (i in1:length(unique_cells)) { cell_x <- unique_cells[i]# Cell_x unique_names_cell_x <-unique(x[[k]]$accepted_name[x[[k]]$cell == cell_x])for (j in1:length(unique_cells)) { cell_y <- unique_cells[j]# Duplicate comparisons.if (cell_x == cell_y || cell_x > cell_y) {next }# Cell_y unique_names_cell_y <-unique(x[[k]]$accepted_name[x[[k]]$cell == cell_y])# Intersections. intersection <-length(generics::intersect(unique_names_cell_x, unique_names_cell_y)) Un <-length(generics::union(unique_names_cell_x, unique_names_cell_y)) jaccard_similarity <- intersection/Un# Combine results. jaccard_similarity_table <-rbind(jaccard_similarity_table, data.frame(cell_x = cell_x, cell_y = cell_y, jaccard_similarity = jaccard_similarity)) } }# Results. js_table[[k]] <- jaccard_similarity_table }return(js_table)}#' @return calculate czekanowski similarity coefficient#' @param #' @param #' @exportczekanowski_similarity <-function(x) {2*abs(sum(x$minimum))/((sum(x$count_cell_x) +sum(x$count_cell_y)))}#' @return #' @param #' @param #' @export# Cross-join function.gridComb <-function(x, cell, accepted_name) { cA <-expand.grid(cell =unique(x$cell), unique(x$accepted_name)) |>setNames(nm =c("cell","accepted_name"))return(cA)}#' @return #' @param #' @param #' @export# Count taxon occurrence per uqniue cell combination.czekanowski_data_prep <-function(x, cell, accepted_name) { count_taxa_x <- x |>group_by(cell, accepted_name) |>summarize(count =n(), .groups ='drop') |>rename("cell_x"="cell", "count_cell_x"="count") count_taxa_y <- x |>group_by(cell, accepted_name) |>summarize(count =n(), .groups ='drop') |>rename("cell_y"="cell", "count_cell_y"="count")# Cell pairs. cell <-unique(x[[cell]]) taxa <-unique(x[[accepted_name]]) cell_combinations <-expand.grid(cell_x = cell, cell_y = cell,accepted_name = taxa) |>filter(cell_x != cell_y) result <- cell_combinations |>left_join(count_taxa_x, by =c("cell_x","accepted_name"), relationship ="many-to-many") |># Second join (y) left_join(count_taxa_y, by =c("cell_y", "accepted_name")) |>select("cell_x", "cell_y", "accepted_name", "count_cell_x", "count_cell_y") |># Replace NA with 0replace_na(replace =list(count_cell_x =0, count_cell_y =0)) |># Remove rows that at 0 in both count fields.filter(!(count_cell_x ==0& count_cell_y ==0)) |># Remove duplicated cell combinationsfilter(cell_x == cell_y | cell_x > cell_y) |># Split by cell combinationgroup_split(cell_x,cell_y)return(result)}# To calculate BC:BC <-function(genera, localities) { O <-length(genera) # occs N <-length(unique(genera)) # genera L <-length(unique(localities))return((O-N)/((L*N)-N))}
Paleobiology Database
The fossil occurrence data analysed in this study was retrieved from the Paleobiology Database on February of 2024. Data pre-processing made use of functions from the fossilbrush and CoordinateCleaner R packages.
Code
# Read occurrence dataset.pbdb <- pbdb <-read.csv(file ='~/Desktop/PT/Codes/pbdb.feb2024.csv')# Adjust radiometric agesinterval.ma <- pbdb |>group_by(early_interval) |>summarise(min_ma =min(min_ma))names(interval.ma) <-c("early_interval", "interval.ma")pbdb <-merge(pbdb, interval.ma, by=c("early_interval"))# Find first and last occurrences and merge back into data frame, using min_ma columnfadlad <- pbdb |>group_by(accepted_name) |>summarise(fad =max(interval.ma),lad =min(interval.ma) )# Merge fad and lad information into data framepbdb <-merge(pbdb, fadlad, by=c("accepted_name"))# Add extinction/survivor binary variablepbdb$ex <-0pbdb$ex[pbdb$interval.ma==pbdb$lad] <-1# Keep two classes and select the Induan and Changhsingian. pbdb <- pbdb |>filter(class %in%c("Gastropoda","Bivalvia"), interval.ma %in%c("252.17","251.2")) |># Identify Invalid Coordinates.cc_val(lat ="paleolat", lon ="paleolng")# Use fossilbrush to clean taxonomic errorsb_ranks <-c("phylum", "class", "order", "family", "accepted_name") #accepted_name is genus name# Define a list of suffixes to be used at each taxonomic level when scanning for synonymsb_suff =list(NULL, NULL, NULL, NULL, c("ina", "ella", "etta"))pbdb2 <-check_taxonomy(pbdb, suff_set = b_suff, ranks = b_ranks, verbose =FALSE,clean_name =TRUE, resolve_duplicates =TRUE, jump =5)
+ cleaning names at rank phylum
+ cleaning names at rank class
+ cleaning names at rank order
+ cleaning names at rank family
+ cleaning names at rank accepted_name
+ resolving duplicates at rank accepted_name
+ resolving duplicates at rank family
+ resolving duplicates at rank order
+ resolving duplicates at rank class
Code
# resolves homonyms, and jump refers to which taxonomic rank is the highest we resolve to. jump = 5 will stop before phylum since phylum level usually has little error.# Extract PBDB data from obdb2 so we have the corrected taxa:pbdb <- pbdb2$data[1:nrow(pbdb),]# Select variables.pbdb <- pbdb |>select(any_of(c("early_interval","interval.ma","occurrence_no","fad","lad","accepted_name","lat","long","ex","phylum","class","paleolat","paleolng","collection_no","reference_no")))
Visualization of Cells and Occurrences
The globe is divided into a grid of equal-area icosahedral hexagonal cells using the hexagrid() function in icosa. In hexagrid(deg = x), is roughly equivalent to longitudinal degrees, so that a degree of 1 is roughly equal to 111 km. This selects a tessellation vector, which translates to the amount of area you select for each cell. In our specified grid, each cell is roughly 629,000 km2 and results in a grid of 812 cells.
Code
# Get raw dataset before filtering for 20 occspbdb.2cha <- pbdb |>filter(interval.ma==252.17)pbdb.2ind <- pbdb |>filter(interval.ma==251.2)# Find raw locations for each stage:coords.cha <-subset(pbdb.2cha, select =c(paleolng, paleolat)) coords.cha <- coords.cha |>mutate_at(c('paleolng', 'paleolat'), as.numeric)coords.ind <-subset(pbdb.2ind, select =c(paleolng, paleolat)) coords.ind <- coords.ind |>mutate_at(c('paleolng', 'paleolat'), as.numeric)# Set up the gridhexa <-hexagrid(deg=4.5, sf=TRUE) #each deg = ~111 kmhexa
A/An hexagrid object with 1620 vertices, 2430 edges and 812 faces.
The mean grid edge length is 493.6 km or 4.44 degrees.
Use plot3d() to see a 3d render.
Code
# Find cell locations for each occurrencecells.cha <-locate(hexa,coords.cha) #str(cells.cha) #to see which cells have occ'scells.ind <-locate(hexa, coords.ind)#str(cells.ind)# Next add cells to coords in order to match cells with their coordinates:coords.cha$cell <- cells.cha names(coords.cha) <-c("long", "lat", "cell")coords.ind$cell <- cells.ind names(coords.ind) <-c("long", "lat", "cell")tcells.cha <-table(cells.cha) #to get no. of occupied cells#str(tcells.cha) #get frequency of cell occ'stcells.ind <-table(cells.ind)#str(tcells.ind)data.2cha <-cbind(pbdb.2cha, coords.cha) #assigns cell number for each occurrencedata.2ind <-cbind(pbdb.2ind, coords.ind)pbdb <-rbind(data.2cha, data.2ind)
Grid Plots
Next, visualize all occurrences for each stage, using the package rgplates and icosa on R. Please note that this version requires that you have the Gplates software installed in your computer, as it is the most optimal version of rgplates.
Code
# Call to Gplates offline (requires installed Gplates software)td <-tempdir() #temporary directory#tdrgPath <-system.file(package="rgplates")#list.files(rgPath) #confirm that this is the correct pathunzip(file.path(rgPath, "extdata/paleomap_v3.zip"), exdir=td)#list.files(file.path(td)) #confirm extraction has happened by looking at temporary directorypathToPolygons <-file.path(td, "PALEOMAP_PlatePolygons.gpml") #static plate polygonspathToRotations <-file.path(td, "PALEOMAP_PlateModel.rot")pm <-platemodel(features =c("static_polygons"= pathToPolygons),rotation = pathToRotations)# Plot it out:edge <-mapedge() #edge of the mapplates.pt<-reconstruct("static_polygons", age=252, model =pm)plot(edge, col ="lightblue2")plot(plates.pt$geometry, col ="gray60", border =NA, add =TRUE)plot(hexa, border="white",add =TRUE)gridlabs(hexa, cex=0.5) #get labels for each cell, labeled as spiral from North pole of grid
Changhsingian occurrences
Occurrences in the Changhsingian, with colors indicating the number of occurrences in occupied cells.
Code
#Permian:platesMoll <- sf::st_transform(plates.pt, "ESRI:54009")#^transform plates to Mollweide projection to plotplot(hexa, tcells.cha, crs ="ESRI:54009",border ="lightgrey",pal=c("#440154ff","darkorchid2","deepskyblue","royalblue", "goldenrod"),breaks =c(0, 20, 100, 200, 400, 600),reset=FALSE)plot(platesMoll$geometry, add =TRUE,border=NA, col ="#66666688")
Induan occurrences
Occurrences in the Induan, with colors indicating the number of occurrences in occupied cells.
Here, we follow the equation from Sidor et al. 2013 to investigate changes in Biogeographic Connectedness.
Code
# Calculate BC using the function we created earlierbc.pt <- pbdb |>group_by(early_interval) |>summarise(BC =BC(accepted_name, cell),count_accepted_name =n(),count_unique_accepted_name =n_distinct(accepted_name) )names(bc.pt) <-c("stage", "BC","occurrences","genera")#Visualize:bc.pt |>group_by(stage) |>ggplot(aes(x = stage, y = BC, fill = stage)) +scale_fill_manual(values =c("deepskyblue", "dodgerblue4")) +scale_color_manual(values =c("deepskyblue", "dodgerblue4")) +geom_bar(stat ="identity") +theme_minimal() +theme(plot.title =element_text(hjust =0.5, size =8, face ="bold"),axis.title =element_text(size =8,face ="bold"),axis.text =element_text(size =8),strip.text =element_text(face ="bold"),legend.position ="none",aspect.ratio =1)
Data pre-processing
We investigate the data by dividing it by stage and taxonomic class. We determine the number of cells and occurrences for each stage.
Code
# Data balance.pbdb |>group_by(class,early_interval) |>count() |>ggplot(mapping =aes(x = class, y = n, fill = class)) +geom_bar(stat ="identity") +labs(x =NULL, y ="Sample Size") +scale_fill_manual(values =c("#69b3a2","#404080")) +scale_color_manual(values =c("#69b3a2","#404080")) +facet_wrap(.~ early_interval, scales ="free") +theme_minimal() +theme(plot.title =element_text(hjust =0.5, size =8, face ="bold"),axis.title =element_text(size =8,face ="bold"),axis.text =element_text(size =8),strip.text =element_text(face ="bold"),legend.position ="none",aspect.ratio =1)
Code
# Set min occurrencesmin_occ <-20# Changhsingian cells.changhsingian_pbdb <- pbdb |>filter(early_interval =="Changhsingian")changhsingian_pbdb <- changhsingian_pbdb |># Cells.mutate(cell =locate(x = hexa,y = changhsingian_pbdb |>select("paleolng", "paleolat")))# Count occurrences per cell and filter by minimum occurrence.changhsingian_pbdb <- changhsingian_pbdb |>group_by(cell) |>count() |>setNames(nm =c("cell","occs")) |>inner_join(changhsingian_pbdb,by =c("cell")) |>filter(occs >= min_occ)# Cell centroids.changhsingian_centroid <-as.data.frame(centers(hexa))[names(table(changhsingian_pbdb$cell)),] |>rownames_to_column(var ="cell")# Add centroid to master dataframe: Longitude and Latitude.changhsingian_pbdb <- changhsingian_pbdb |>left_join(changhsingian_centroid, by ="cell")# Induan cellsinduan_pbdb <- pbdb |>filter(early_interval =="Induan")induan_pbdb <- induan_pbdb |># Cells.mutate(cell =locate(x = hexa,y = induan_pbdb |>select("paleolng", "paleolat")))# Count occurrences per cell and filter by minimum occurrence.induan_pbdb <- induan_pbdb |>group_by(cell) |>count() |>setNames(nm =c("cell","occs")) |>inner_join(induan_pbdb,by =c("cell")) |>filter(occs >= min_occ)# Cell centroidsinduan_centroid <-as.data.frame(centers(hexa))[names(table(induan_pbdb$cell)),] |>rownames_to_column(var ="cell")# Add centroid coordinates to master dataframe.induan_pbdb <- induan_pbdb |>left_join(induan_centroid, by ="cell")# Combine the two datasets: Changhsingian & Induan.# The pbdb dataset is has now been fully pre-processed.pbdb <-bind_rows(changhsingian_pbdb, induan_pbdb)# Create unique identifier for each cell.pbdb <-data.frame(unique(pbdb$cell)) |>setNames(nm ="cell") |>mutate(cell_id =c(1:length(cell))) |>inner_join(pbdb, by ="cell")# Plot number of occurrences per stage and cell.cell_text <-data.frame(label =c("N = 13 cells", "N = 20 cells"),early_interval =c("Changhsingian", "Induan"))pbdb |>group_by(early_interval,cell) |>count() |>ggplot(mapping =aes(x = cell, y = n)) +geom_col(col ="white", bg ="#53565A") +coord_flip() +geom_hline(yintercept =20, color ="#B83A4B") +labs(x =NULL, y ="Occurrences") +geom_text(data = cell_text, mapping =aes(x =c(12,18), y =100, label = label),hjust =-0.1, vjust =-0.1, size =3) +facet_wrap(.~ early_interval,scales ="free",nrow =1) +theme_bw() +theme(aspect.ratio =1,axis.text =element_text(size =8),axis.title =element_text(face ="bold"),strip.text =element_text(face ="bold"))
For each stage we create individual dataframes based on the cell units and store these into separate lists.
Code
# Data splitting based on cell id and stage.changhsingian_split <- pbdb |>filter(early_interval =="Changhsingian") |>group_split(cell_id) |>lapply(as.data.frame)induan_split <- pbdb |>filter(early_interval =="Induan") |>group_split(cell_id) |>lapply(as.data.frame)
Subsampling by cells and occurrence
Here we perform subsampling without replacement on our stage-level datasets using 999 iterations. For the Changhsingian we randomly sample 20 occurrences per cell and repeated the process as stated above. Conversely, for the Induan, we applied a two-step subsampling procedure by first subsampling down to 13 cells and then by occurrences. The results are subsampled datasets (cell-specific) saved as nested objects within a larger list. These are subsequently, merged into single master dataframes (i.e., the cells) to create one single list containing 999 dataframes.
As indicated in the previous section, we here combine cell-specific dataframes (N=13) into single joint dataframes (13*20 = 260 rows). This is repeated for all 999 sub-sampled dataframes. Worthy of note, the cells in the Induan list, will inevitably vary between the subsampled datasets, whereas, in the case of the Changhsingian they are all identical. This is because our analysis seeks to assess the impact by cell heterogeneity across geologic stages.
For each subsampled dataset in both the Induan and Changhsingian lists we here count the number of occurrence of each genera by cell. This is done for all dataframes and are then combined into one master dataframe.
Czekanowski equation following Miller et al., 2009
Czekanowski = 2 * \frac{\sum \min(x_{1k}, x_{2k})}{\sum x_{1k} + \sum x_{2k}} The occurrence of a given taxa between distinct cells are evaluated against each other.
Code
# Changhsingian.cha_combs <-gridComb(x = changhsingian_pbdb,cell = cell,accepted_name = accepted_name) # 13*221 = 2873 rows.# Induan.ind_combs <-gridComb(x = induan_pbdb,cell = cell,accepted_name = accepted_name) # 20*93 = 1860 rows. 190 unique cell pairs (check!)# Next count the occurrence of genera per unique cell. This will also include genera with no occurrence in any given cell (i.e. 0).# These are subsequently removed in the next step.countGen <-function(combinations, age_lists) { purrr::map(seq_along(age_lists), function(i) { name_counts <- combinations |>left_join(age_lists[[i]] |>group_by(cell, accepted_name) |>count(), by =c("cell", "accepted_name")) |># Replace NA with 0.replace_na(list(n =0))return(name_counts) })}# Changhsingian.cha_genCell <-countGen(combinations = cha_combs ,age_lists = combined_boot_changhsingian)# Induan.ind_genCell <-countGen(combinations = ind_combs ,age_lists = combined_boot_induan)# Create two identical count dataframes for each pair to join against.# Changhsingian.changhsingian_count_lsX <- purrr::map(cha_genCell, ~ .x |>rename("cell_x"="cell", "count_cell_x"="n"))changhsingian_count_lsY <- purrr::map(cha_genCell, ~ .x |>rename("cell_y"="cell", "count_cell_y"="n"))# Induan.induan_count_lsX <- purrr::map(ind_genCell, ~ .x |>rename("cell_x"="cell", "count_cell_x"="n"))induan_count_lsY <- purrr::map(ind_genCell, ~ .x |>rename("cell_y"="cell", "count_cell_y"="n"))# Merge counts for each cell pair.mCount <-function(cell_pairs, X, Y) { purrr::map(1:999, function(i) {# Rename the fields so that it matches. oG <- cell_pairs |>rename("cell_x"="x", "cell_y"="y") |># First join (x)left_join(X[[i]], by ="cell_x", relationship ="many-to-many") |># Second join (y) left_join(Y[[i]], by =c("cell_y", "accepted_name")) |>select("cell_x", "cell_y", "accepted_name", "count_cell_x", "count_cell_y")return(oG) })}# Changhsingian.cha_joined <-mCount(cell_pairs = cells_distinct_pair_ch,X = changhsingian_count_lsX, Y = changhsingian_count_lsY)# Induan.ind_joined <-mCount(cell_pairs = cells_distinct_pair_in,X = induan_count_lsX, Y = induan_count_lsY)# We then split based on distinct cell pairs. This will creates a nested list with X splits (78 for changhsingian and# 190 for the Induan) each dataframe i.e. 999. We also remove any genera (i.e. accepted name) were 0 occurrences is recorded between cell pairs.# This step also add a new field (the minimum field) which is based on the lowest number occurrences of a particular taxa between two cells.czekanowski_splits <-function(joined_lists) { purrr::map(1:999, function(i) { oP <- joined_lists[[i]] |># Removefilter(!(count_cell_x ==0& count_cell_y ==0)) |># Compute the minimum value between cell x and cell y (use count variable)mutate(minimum =pmin(count_cell_x, count_cell_y)) |>group_by(cell_x, cell_y) |>group_split()return(oP) })}cz_cha_prep <-czekanowski_splits(cha_joined)cz_ind_prep <-czekanowski_splits(ind_joined)# Compute the czekanowski index.changhsingian_czekanowski <-vector(mode ="list")for(i inseq_along(cz_cha_prep)) { cz <-lapply(cz_cha_prep[[i]], czekanowski_similarity) changhsingian_czekanowski[[i]] <- cz}induan_czekanowski <-vector(mode ="list")for(i inseq_along(cz_ind_prep)) { czI <-lapply(cz_ind_prep[[i]], czekanowski_similarity) induan_czekanowski[[i]] <- czI}# Cell pairs.pairs_cha <-do.call("rbind",lapply(cz_cha_prep[[1]], function(x) x[1:2][1,]))# Find all pairs in the Induanpairs_ind <-vector(mode ="list")for(i in1:999) { append_cells <-do.call("rbind",lapply(cz_ind_prep[[i]], function(x) x[1:2][1,])) pairs_ind[[i]] <- append_cells}# Reformat cha_cz_results <- purrr::map(changhsingian_czekanowski, ~as.data.frame(unlist(.x)) |>rename("cz"=1) |>cbind(pairs_cha) |>relocate(.after ="cell_y","cz") |>rename("x.cell"="cell_x", "y.cell"="cell_y") )ind_cz_results <- purrr::map(induan_czekanowski, ~as.data.frame(unlist(.x)) |>rename("cz"=1))# Now bind the cell pairs to the Induan datasets.ind_cz_results <-mapply(function(x, y) cbind(y, x), ind_cz_results, pairs_ind, SIMPLIFY =FALSE)# Compute the average czekanowski per cell paircha_czekanowski_dataframe <-bind_rows(cha_cz_results) |>rename("cell_x"="x.cell", "cell_y"="y.cell")ind_czekanowski_dataframe <-bind_rows(ind_cz_results)
Results
Code
# Changhsingianchanghsingian_res_matrix <- changhsingian_res_matrix |>rename("x.cell"="x","y.cell"="y") |>left_join(ave_changhsingian_jaccard,by =c("x.cell","y.cell"))# Induaninduan_res_matrix <- induan_res_matrix |>rename("x.cell"="x","y.cell"="y") |>left_join(ave_induan_jaccard,by =c("x.cell","y.cell"))# Bin by distance between cells (GCD in km's)changhsingian_res_matrix$cutdist <-cut(changhsingian_res_matrix$gcd,breaks =c(0, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000), labels =c("0", "2000", "4000", "6000","8000", "10000", "12000","14000", "16000", "18000"),include.lowest =TRUE)induan_res_matrix$cutdist <-cut(induan_res_matrix$gcd,breaks =c(0, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000), labels =c("0", "2000", "4000", "6000","8000", "10000", "12000","14000", "16000", "18000"),include.lowest =TRUE)# Average and sd for Changhsingian.sumRes_01 <- changhsingian_res_matrix |>group_by(cutdist) |>summarise(# Jaccardavg =mean(avg_jaccard, na.rm =TRUE),sdev =sd(avg_jaccard, na.rm =TRUE),n =n(),se = sdev/sqrt(n),# Quantilesfirst =quantile(avg_jaccard,probs=0.25),second =quantile(avg_jaccard,probs=0.975) ) |>mutate(label ='Changhsingian',label =as.factor(label)) |>mutate(cutdist = cutdist,cutdist =factor(cutdist,levels =c("0","2000","4000","6000","8000","10000","12000","14000","16000","18000","20000"))) |>mutate(ci = se *qt(.975, n -1), ci =as.numeric(ci)) |>as.data.frame() |>suppressWarnings() # This was added to ignore the last observation.# Average and sd for the Induan.sumRes_02 <- induan_res_matrix |>group_by(cutdist) |>summarise(# Jaccardavg =mean(avg_jaccard, na.rm =TRUE),sdev =sd(avg_jaccard, na.rm =TRUE),n =n(),se = sdev/sqrt(n),# Quantilesfirst =quantile(avg_jaccard,probs=0.25),second =quantile(avg_jaccard,probs=0.975) ) |>mutate(label ='Induan',label =as.factor(label)) |>mutate(cutdist = cutdist,cutdist =factor(cutdist,levels =c("0","2000","4000","6000","8000","10000","12000","14000","16000","18000","20000"))) |>mutate(ci = se *qt(.975, n -1), ci =as.numeric(ci)) |>as.data.frame() |>suppressWarnings() # This was added to ignore the last observation.# Combine the two.sumRes_03 <-bind_rows(sumRes_01,sumRes_02)# Plot.sumRes_03 |>ggplot(aes(x = cutdist, y = avg, group = label, colour = label, fill = label)) +geom_errorbar(aes(ymin =pmax(second), ymax=first), width=0.05,linewidth=1)+geom_line(linewidth =1.2) +scale_size_continuous(breaks =c(5,10,15,20,25,30)) +geom_point(aes(size = n),shape =21, fill ="white", stroke =2) +scale_fill_manual(values =c("deepskyblue","dodgerblue4")) +scale_color_manual(values =c("deepskyblue","dodgerblue4")) +labs(x ="Great Circle Distance (km)", y ="Similarity Value",title ="Jaccard",subtitle ="Subsampling by cells and occurrences", colour ="Stages", size ="Cell-pair comparison") +theme_classic() +theme(plot.title =element_text(face ="bold"),axis.title =element_text(face ="bold"),legend.title =element_text(face ="bold"),aspect.ratio =1)
Code
# Changhsingianchanghsingian_res_matrix <- changhsingian_res_matrix |>left_join( cha_czekanowski_dataframe |>rename("x.cell"="cell_x", "y.cell"="cell_y") |>group_by(x.cell,y.cell) |>summarise(avg_cz =mean(cz, na.rm =TRUE)), by =c("x.cell","y.cell")) |>relocate(.after ="avg_jaccard","avg_cz")# Induaninduan_res_matrix <- induan_res_matrix |>left_join( ind_czekanowski_dataframe |>rename("x.cell"="cell_x", "y.cell"="cell_y") |>group_by(x.cell,y.cell) |>summarise(avg_cz =mean(cz, na.rm =TRUE)), by =c("x.cell","y.cell")) |>relocate(.after ="avg_jaccard","avg_cz")# pairs_ind <- # pairs_ind |> # bind_rows() |> # filter(cell_x<cell_y) |> distinct(cell_x,cell_y)# Average and standard deviation for Changhsingian.sumRes_04 <- changhsingian_res_matrix |>group_by(cutdist) |>summarise(avg =mean(avg_cz, na.rm =TRUE),sdev =sd(avg_cz, na.rm =TRUE),n =n(),se = sdev/sqrt(n),# Quantilesfirst =quantile(avg_cz,probs=0.25),second =quantile(avg_cz,probs=0.975) ) |>mutate(label ='Changhsingian',label =as.factor(label)) |>mutate(cutdist = cutdist,cutdist =factor(cutdist,levels =c("0","2000","4000","6000","8000","10000","12000","14000","16000","18000","20000"))) |>mutate(ci = se *qt(.975, n -1), ci =as.numeric(ci)) |>as.data.frame() |>suppressWarnings() # This was added to ignore the last observation.sumRes_05 <- induan_res_matrix |>group_by(cutdist) |>summarise(avg =mean(avg_cz, na.rm =TRUE),sdev =sd(avg_cz, na.rm =TRUE),n =n(),se = sdev/sqrt(n),# Quantilesfirst =quantile(avg_cz,probs=0.25),second =quantile(avg_cz,probs=0.975) ) |>mutate(label ='Induan',label =as.factor(label)) |>mutate(cutdist = cutdist,cutdist =factor(cutdist,levels =c("0","2000","4000","6000","8000","10000","12000","14000","16000","18000","20000"))) |>mutate(ci = se *qt(.975, n -1), ci =as.numeric(ci)) |>as.data.frame() |>suppressWarnings()sumRes_06 <-bind_rows(sumRes_04,sumRes_05)# Plot.sumRes_06 |>ggplot(aes(x = cutdist, y = avg, group = label, colour = label, fill = label)) +geom_errorbar(aes(ymin =second, ymax=first), width=0.05,linewidth=1)+geom_line(linewidth =1.2) +scale_size_continuous(breaks =c(5,10,15,20,25,30)) +geom_point(aes(size = n),shape =21, fill ="white", stroke =2) +scale_fill_manual(values =c("deepskyblue","dodgerblue4")) +scale_color_manual(values =c("deepskyblue","dodgerblue4")) +labs(x ="Great Circle Distance (km)", y ="Similarity Value",title ="Czekanowski",subtitle ="Subsampling by cells and occurrences", colour ="Stages", size ="Cell-pair comparison") +theme_classic() +theme(plot.title =element_text(face ="bold"),axis.title =element_text(face ="bold"),legend.title =element_text(face ="bold"),aspect.ratio =1)
Count occurrences per genus in the Induan to determine whether they match the disaster taxa in literature (Claraia, Eumorphotis, etc).
Code
# find the top 5 genera based on number of occurrences genus_counts <- pbdb.2ind |>group_by(accepted_name) |>summarise(count =n()) |>top_n(5, wt = count) |>arrange(desc(count))# visualize itggplot(genus_counts, aes(x = accepted_name, y = count)) +theme_classic() +geom_bar(stat ="identity", fill ="dodgerblue3") +labs(x ="Genus Name", y ="Number of Occurrences")